Does cryptic genetic variation contribute to the adaptive divergence of freshwater populations of threespine stickleback? Let’s get that genomic data to start figuring it out.
This log describes the initial exploratory data analysis from a subset of the libraries in the plasticity project.
# Prep the environment ----
# library() rather than require(): require() returns FALSE on a missing
# package instead of erroring, which would let the rest of the notebook
# fail in confusing ways later.
library(tidyverse)  # readr/dplyr/tidyr/ggplot2 used throughout
library(vcfR)       # VCF inspection of stacks output
library(whoa)       # heterozygote miscall rate (Ho vs He) estimation
library(cowplot)    # plot_grid() for the multi-panel figures below
theme_set(theme_classic())
Conclusion: Using standard filtering + coverage filtering (genotype depth > 5 and mean site depth > 10), missingness filtering (iterative, up to genotype presence > 95% and individuals with up to 50% missing data), and paralog filtering (high coverage, allele balance) resulted in ~150k RAD loci with mean coverage ~14x and a heterozygote miscall rate of about 2% (Ho vs He)
Note: some files from this analysis have been deleted from the github repo, so this notebook will not run and should be viewed as the rendered html notebook
plastic #top directory seq_data #all sequencing data stacks #all input (except raw sequence data) and output of stacks info #all pop_maps alignments #all alignments (sorted bam files output from bowtie2) genome #bowtie indices and reference genome genotypes #output of gstacks and populations slurm #all stacks slurm jobs cleaned #cleaned radtags
For process_radtags, each set of PE data needs to be in its own directory; write a shell script to accomplish this.
# Move each PE fastq file into a per-sample directory, named by stripping
# the _R[12]_001.fastq.gz suffix (process_radtags wants one directory per
# PE data set). Parameter expansion replaces the old echo|sed subshell.
for f in *.fastq.gz; do
    [ -e "$f" ] || continue           # no matches: the glob stays literal — skip
    name="${f%_R[12]_001.fastq.gz}"   # drop suffix; R1 and R2 map to the same dir
    mkdir -p "$name"
    mv "$f" "$name"
done
Used a python script to split the file ‘master_barcode_key.csv’ into separate files for each lane.
""" The input file has four columns, this script takes writes columns 2 and 3 (barcode and individual) to a new file based on the value of column 4."""
import csv
with open('/Users/ddayan/Science/plasticity/analysis/bioinformatics/metadata/master_barcode_key.csv') as fin:
csvin = csv.DictReader(fin)
# Category -> open file lookup
outputs = {}
for row in csvin:
cat = row['library']
# Open a new file and write the header
if cat not in outputs:
fout = open('{}.csv'.format(cat), 'w')
dw = csv.DictWriter(fout, fieldnames=csvin.fieldnames)
dw.writeheader()
outputs[cat] = fout, dw
# Always write the row
outputs[cat][1].writerow(row)
# Close all the files
for fout, _ in outputs.values():
fout.close()
oops, only meant to keep columns 2 and 3 and need to write to tab delimited file
# Keep only columns 2 and 3 (barcode, individual) and convert the per-lane
# comma-delimited files to the tab-delimited barcode format process_radtags
# expects. One quoted pipeline per file replaces the old two-pass
# cut-to-.tmp / tr-from-.tmp version, which left unquoted expansions and
# stray .tmp files behind. Output names are unchanged: LANE_barcodes.txt.
for i in ./*.csv
do
    cut -d "," -f 2,3 "$i" | tr "," "\t" > "${i%.csv}_barcodes.txt"
done
After raw reads go through QC and are demultiplexed and concatenated, they are aligned to the genome. Alignment used BWA-MEM with default options against the BROAD S1 G. aculeatus genome downloaded from Ensembl.
two slurm scripts are below for bwa mem alignments: single lane and batch submission
#!/bin/bash
# SLURM job: align cleaned PE reads for an explicit sample list with bwa mem.
# set max wall-clock time (D-HH:MM:SS)
#SBATCH --time=6-23:59:00
#SBATCH --cpus-per-task=38
# set partition/queue to use
#SBATCH --partition=week-long-cpu
# set name of job
#SBATCH --job-name=bwa_default
# set name of output file
#SBATCH --output=bwadefault.out
# send mail to this address
# NOTE(review): there is no --mail-type directive here, so SLURM may send no
# mail at all despite --mail-user — confirm this is intended.
#SBATCH --mail-user=ddayan@clarku.edu
# put samtools 1.6 on PATH (site-provided environment script)
source /opt/samtools-1.6/source_me
files="bb_f_10
bb_f_110
bb_f_119
bb_f_121
bb_f_129
bb_f_131
bb_f_157
bb_f_161
bb_f_165
bb_f_168
bb_f_170
bb_f_174
bb_f_181
bb_f_183
bb_f_186
bb_f_189
bb_f_19
bb_f_191
bb_f_193
bb_f_20
bb_f_213
bb_f_218
bb_f_221
bb_f_231
bb_f_236
bb_f_237
bb_f_239
bb_f_24
bb_f_282
bb_f_288
bb_f_316
bb_f_320
bb_f_4
bb_f_41
bb_f_43
bb_f_46
bb_f_5
bb_f_56
bb_f_66
bb_f_82
bb_f_99
bb_s_1
bb_s_104
bb_s_108
bb_s_114
bb_s_116
bb_s_126
bb_s_13
bb_s_133
bb_s_135
bb_s_141
bb_s_143
bb_s_151
bb_s_153
bb_s_159
bb_s_167
bb_s_170
bb_s_172
bb_s_175
bb_s_18
bb_s_185
bb_s_193
bb_s_20
bb_s_204
bb_s_205
bb_s_206
bb_s_207
bb_s_209
bb_s_210
bb_s_216
bb_s_220
bb_s_225
bb_s_237
bb_s_241
bb_s_246
bb_s_250
bb_s_256
bb_s_262
bb_s_273
bb_s_277
bb_s_283
bb_s_284
bb_s_287
bb_s_292
bb_s_300
bb_s_311
bb_s_314
bb_s_321
bb_s_45
bb_s_68
bb_s_75
bb_s_77
bb_s_81
bb_s_86
bb_s_89
cl_f_119
cl_f_12
cl_f_121
cl_f_126
cl_f_139
cl_f_141
cl_f_142
cl_f_146
cl_f_148
cl_f_154
cl_f_155
cl_f_159
cl_f_175
cl_f_177
cl_f_187
cl_f_192
cl_f_197
cl_f_20
cl_f_21
cl_f_214
cl_f_226
cl_f_234
cl_f_236
cl_f_25
cl_f_260
cl_f_265
cl_f_269
cl_f_270
cl_f_272
cl_f_276
cl_f_286
cl_f_288
cl_f_296
cl_f_312
cl_f_314
cl_f_34
cl_f_343
cl_f_348
cl_f_38
cl_f_41
cl_f_46
cl_f_52
cl_f_53
cl_f_55
cl_f_56
cl_f_68
cl_f_76
cl_f_79
cl_f_85
cl_f_88
cl_s_104
cl_s_110
cl_s_113
cl_s_115
cl_s_117
cl_s_119
cl_s_129
cl_s_142
cl_s_147
cl_s_148
cl_s_154
cl_s_17
cl_s_179
cl_s_188
cl_s_189
cl_s_19
cl_s_190
cl_s_199
cl_s_20
cl_s_215
cl_s_23
cl_s_234
cl_s_236
cl_s_249
cl_s_257
cl_s_26
cl_s_27
cl_s_284
cl_s_30
cl_s_308
cl_s_310
cl_s_32
cl_s_34
cl_s_35
cl_s_37
cl_s_40
cl_s_43
cl_s_47
cl_s_57
cl_s_58
cl_s_71
cl_s_77
cl_s_8
cl_s_9
cl_s_96
lb_f_100
lb_f_109
lb_f_114
lb_f_119
lb_f_134
lb_f_137
lb_f_142
lb_f_143
lb_f_146
lb_f_150
lb_f_151
lb_f_172
lb_f_178
lb_f_179
lb_f_185
lb_f_191
lb_f_197
lb_f_202
lb_f_206
lb_f_214
lb_f_220
lb_f_226
lb_f_230
lb_f_232
lb_f_237
lb_f_239
lb_f_251
lb_f_253
lb_f_255
lb_f_256
lb_f_275
lb_f_284
lb_f_305
lb_f_49
lb_f_57
lb_f_81
lb_f_91
lb_f_96
lb_s_110
lb_s_125
lb_s_129
lb_s_13
lb_s_135
lb_s_138
lb_s_148
lb_s_151
lb_s_159
lb_s_170
lb_s_171
lb_s_176
lb_s_184
lb_s_19
lb_s_204
lb_s_207
lb_s_21
lb_s_211
lb_s_226
lb_s_231
lb_s_239
lb_s_242
lb_s_245
lb_s_248
lb_s_257
lb_s_269
lb_s_280
lb_s_292
lb_s_295
lb_s_309
lb_s_313
lb_s_38
lb_s_39
lb_s_4
lb_s_41
lb_s_50
lb_s_62
lb_s_63
lb_s_72
lb_s_77
lb_s_80
lb_s_82
lb_s_90
rs_f_10
rs_f_114
rs_f_115
rs_f_120
rs_f_13
rs_f_130
rs_f_135
rs_f_137
rs_f_138
rs_f_144
rs_f_152
rs_f_157
rs_f_158
rs_f_160
rs_f_168
rs_f_178
rs_f_18
rs_f_183
rs_f_185
rs_f_186
rs_f_189
rs_f_203
rs_f_204
rs_f_205
rs_f_210
rs_f_214
rs_f_215
rs_f_217
rs_f_228
rs_f_233
rs_f_236
rs_f_240
rs_f_241
rs_f_257
rs_f_262
rs_f_263
rs_f_270
rs_f_272
rs_f_290
rs_f_31
rs_f_34
rs_f_42
rs_f_45
rs_f_46
rs_f_5
rs_f_53
rs_f_54
rs_f_66
rs_f_79
rs_f_85
rs_f_87
rs_s_10
rs_s_106
rs_s_113
rs_s_118
rs_s_12
rs_s_120
rs_s_125
rs_s_127
rs_s_136
rs_s_140
rs_s_15
rs_s_152
rs_s_164
rs_s_166
rs_s_169
rs_s_170
rs_s_172
rs_s_175
rs_s_178
rs_s_182
rs_s_188
rs_s_196
rs_s_212
rs_s_219
rs_s_223
rs_s_226
rs_s_240
rs_s_242
rs_s_244
rs_s_247
rs_s_250
rs_s_253
rs_s_258
rs_s_267
rs_s_27
rs_s_278
rs_s_283
rs_s_285
rs_s_287
rs_s_290
rs_s_291
rs_s_30
rs_s_302
rs_s_306
rs_s_31
rs_s_316
rs_s_321
rs_s_33
rs_s_331
rs_s_344
rs_s_35
rs_s_353
rs_s_356
rs_s_36
rs_s_360
rs_s_363
rs_s_369
rs_s_371
rs_s_379
rs_s_381
rs_s_383
rs_s_4
rs_s_44
rs_s_64
rs_s_65
rs_s_69
rs_s_73
rs_s_76
rs_s_8
rs_s_83
rs_s_84
rs_s_85
rs_s_87
"
#
# Align paired-end data with bwa mem, convert to BAM and sort.
# (Header previously said "Bpwtie2"; the command below is bwa mem.)
#
for sample in $files
do
    # &>> appends so bwa_mem.oe accumulates across samples; the old &>
    # truncated the log each iteration, keeping only the final sample's
    # messages. Note only the sort step's streams are captured here —
    # bwa/samtools-view stderr still lands in the SLURM output file.
    /opt/bio-bwa/bwa mem -t 38 ../genome/bwa_gac ../cleaned/${sample}.1.fq.gz ../cleaned/${sample}.2.fq.gz | /opt/samtools-1.6/bin/samtools view -@ 16 -bSu - | /opt/samtools-1.6/bin/samtools sort -@ 16 - -o ./${sample}.bam &>> bwa_mem.oe
done
for all the files in a directory - does not work yet…
#!/bin/bash
# SLURM job: align every sample found in the cleaned-reads directory.
# set max wall-clock time (D-HH:MM:SS)
#SBATCH --time=6-23:59:00
#SBATCH --cpus-per-task=38
# set partition/queue to use
#SBATCH --partition=week-long-cpu
# set name of job
#SBATCH --job-name=bwa_default
# set name of output file
#SBATCH --output=bwadefault.out
# send mail to this address
#SBATCH --mail-user=ddayan@clarku.edu
source /opt/samtools-1.6/source_me
# Build the sample list from the .fq.gz files present.
# Fixes vs the broken original ("does not work yet"):
#  * command substitution was missing — `files=find . ...` set files="find"
#    and tried to execute "." as a command;
#  * the sed patterns are now anchored ($) and escape the dot, so only the
#    trailing .1/.2 and .rem tags are stripped;
#  * `uniq` only collapses adjacent duplicates and find output is unsorted —
#    `sort -u` deduplicates correctly.
files=$(find . -name "*.fq.gz" -exec basename \{} .fq.gz \; | sed 's/\.[12]$//' | sed 's/\.rem$//' | sort -u)
for sample in $files
do
    # Append (&>>) so the log keeps every sample rather than only the last.
    /opt/bio-bwa/bwa mem -t 38 ../genome/bwa/bwa_gac ../cleaned_run2_lane8/${sample}.1.fq.gz ../cleaned_run2_lane8/${sample}.2.fq.gz | /opt/samtools-1.6/bin/samtools view -@ 16 -bSu - | /opt/samtools-1.6/bin/samtools sort -@ 16 - -o ./${sample}.bam &>> bwa_mem.oe
done
time estimate: for single lane 3 hours
#!/bin/bash
# SLURM job: run gstacks over the sorted BAMs for the 12-8 library subset,
# removing PCR duplicates.
# set max wall-clock time (D-HH:MM:SS)
#SBATCH --time=6-23:59:00
#SBATCH --cpus-per-task=38
# set partition/queue to use
#SBATCH --partition=week-long-cpu
# set name of job
#SBATCH --job-name=gstacks12-8
# set name of output file
#SBATCH --output=gstacks12-8.out
# mail alert at start, end and abortion of execution
#SBATCH --mail-type=ALL
# send mail to this address
#SBATCH --mail-user=ddayan@clarku.edu
source /opt/stacks-2.3/bin/source_me
# -I: alignment dir, -M: popmap (redundant "../info/../info/" path fixed),
# -O: output here, -t: threads.
/opt/stacks-2.3/bin/gstacks -I ../alignments/ -M ../info/popmap_first12-8.txt --rm-pcr-duplicates -O ./ -t 38 &> gstacks_log_12-8.oe
#!/bin/bash
# SLURM job: run the stacks populations module on the gstacks output.
# set max wall-clock time (D-HH:MM:SS)
#SBATCH --time=6-23:59:00
#SBATCH --cpus-per-task=38
# set partition/queue to use
#SBATCH --partition=week-long-cpu
# set name of job
#SBATCH --job-name=popfiltfirst12-8
# set name of output file
#SBATCH --output=popfiltfirst12-8.out
# mail alert at start, end and abortion of execution
#SBATCH --mail-type=ALL
# send mail to this address
#SBATCH --mail-user=ddayan@clarku.edu
source /opt/stacks-2.3/bin/source_me
# -r 0.95       : keep a locus only if present in >=95% of samples per population
# --vcf         : emit SNP/haplotype VCFs for downstream filtering
# --merge_sites : presumably merges loci sharing a restriction cut site —
#                 confirm against the stacks 2.3 manual (flag spelling is
#                 version-sensitive)
# -e pstI       : restriction enzyme used in library prep
/opt/stacks-2.3/bin/populations --in-path ./ -M ../info/popmap_first12-8.txt -t 38 -r 0.95 --vcf --merge_sites -e pstI &> lane8_popfilt.oe
Rationale:
Ran the pipeline above for subset of libraries (68, 2-7, 9-12 - logged with “12-8” prefix). Then used this subset of the data (~15% of data and ~100 fish from each population) to examine coverage and decide on filtering parameters for the final run
first we look at coverage of the data going into stacks, the bam files
pulled the bamstats from the log output of gstacks (“gstacks.log.distribs”), then edited the file with regex to include a population flag
# Per-sample BAM stats parsed from the gstacks log, with a pop column added.
bamstats <- read_tsv("log_files/12-8_run/12-8bamstats.txt")

# Density of retained primary alignments per sample; red line at the median.
# The median is passed as a fixed xintercept rather than inside aes():
# aes(xintercept = median(bamstats$primary_kept)) redraws the identical line
# once per row and reaches outside the plot's data — a ggplot2 anti-pattern.
ggplot(bamstats, aes(x = primary_kept)) +
  geom_density() +
  geom_vline(xintercept = median(bamstats$primary_kept), color = "red") +
  ggtitle("Retained record from BAM input")

# Per-population mapping summaries: median retained reads and mean kept fraction.
bamstats %>%
  group_by(pop) %>%
  summarise(
    median_forward_reads = median(primary_kept),
    mean_kept = mean(kept_frac)
  )
Here we see no issue with variation in mapping quality across the populations and good evidence that we got more than the number of reads we were looking for per individual: ~13 million total reads per sample (or 7.5 million pairs), which, if we have about 350k RAD loci, works out to about 20x coverage before filtering sites (but after filtering for bad reads in the process_radtags fork).
We lost about 20% of the reads in the bam files to default quality filters in the gstacks fork (minmapq 10, max-clipped 0.20, unmapped reads) summary below:
Read 6556590170 BAM records:
kept 5322753707 primary alignments (83.4%), of which 2628057004 reverse reads
skipped 423599169 primary alignments with insufficient mapping qualities (6.6%)
skipped 455204465 excessively soft-clipped primary alignments (7.1%)
skipped 179611285 unmapped reads (2.8%)
skipped some suboptimal (secondary/supplementary) alignment records
At this point the expected coverage is ~ 17x (2433922417 paired reads per 395 samples (about 6 million reads per sample) per estimated 350k rad loci)
similar to above we pulled the distribution of effective coverage from the stacks logs, edited to include a population flag and examined below
First, the summary output from gstacks.log:
Built 1000251 loci comprising 2694696703 forward reads and 2433922417 matching paired-end reads; mean insert length was 244.1 (sd: 75.2).
Removed 260774286 unpaired (forward) reads (9.7%); kept 2433922417 read pairs in 962035 loci.
Removed 796945198 read pairs whose insert length had already been seen in the same sample as putative PCR duplicates (32.7%); kept 1636977219 read pairs.
Genotyped 962035 loci:
effective per-sample coverage: mean=9.4x, stdev=5.1x, min=1.0x, max=36.9x
mean number of sites per locus: 523.7
a consistent phasing was found for 36695500 of out 38359065 (95.7%) diploid loci needing phasing
Quick summary: 33% PCR duplicates plus vastly more RAD loci than expected lead to a drastic reduction in coverage… but let’s look more closely
# Effective-coverage table pulled from the gstacks logs (pop flag added by hand).
eff_cov <- read_tsv("log_files/12-8_run/effective_cov_12-8.txt")

# Density plots with the median marked in red. Medians are computed once and
# passed as fixed xintercept parameters: the old aes(xintercept = median(...))
# form redrew the identical line once per data row.
a <- ggplot(eff_cov, aes(x = n_loci)) +
  geom_density() +
  geom_vline(xintercept = median(eff_cov$n_loci), color = "red") +
  ggtitle("Number of Rad Loci")
b <- ggplot(eff_cov, aes(x = mean_cov_ns)) +
  geom_density() +
  geom_vline(xintercept = median(eff_cov$mean_cov_ns), color = "red") +
  ggtitle("weighted mean coverage")
c <- ggplot(eff_cov, aes(x = pcr_dupl_rate)) +
  geom_density() +
  geom_vline(xintercept = median(eff_cov$pcr_dupl_rate), color = "red") +
  ggtitle("pcr duplicates")
# PCR duplication rate vs effective coverage, with a linear fit; shared aes()
# replaces the duplicated aes() in geom_point and geom_smooth.
d <- ggplot(eff_cov, aes(pcr_dupl_rate, mean_cov_ns)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm")
plot_grid(a, b, c, d)
rm(a, b, c, d)
At this point, things are not looking great: about 50% of samples have less than 10x effective coverage, and this seems largely to be caused by (1) a great increase in the number of RAD loci relative to what was expected given other papers’ numbers for the number of PstI cut sites and my own in silico digest, and (2) PCR duplicates taking up anywhere from ~18% to 60% of the reads depending on the library (it’s clear that the variation in PCR duplication rate is explained by variation at the level of library prep, given the clustering in the last plot above).
quick note: a previous examination of PCR duplication reveals that only 0.6% of duplicates appear to be optical duplicates, suggesting that library complexity is the root issue here, not sequencing.
The good news:
if there are actually >500k PstI cut sites in the stickleback genome, then we have more sites than we need to effectively cover the whole genome. In the worst case from the literature (Roesti et al 2015), LD decays at 1kb, and the stickleback genome is 447mb, so achieving full genome-wide coverage requires 450k RAD loci (IN THE WORST POSSIBLE CASE); therefore we can probably lose some of the poorly sequenced sites and individuals to improve depth and still have markers that cover the entire genome!
To look at things at this level we will have to examine coverage per site per individual, instead of these summary statistics, this is below
after looking at BAMstats and the effective coverage summary stats from gstacks, we look at the coverage of data only filtered by populations fork of stacks
this section covers the first run of populations (-r 0.95, i.e. only retain variable sites if the RAD locus is covered in 95% of samples within a population)
How many of the RAD loci are “bad” (i.e. absent in >5% of samples within a population)?
>Removed 653042 loci that did not pass sample/population constraints from 962035 loci.
Kept 308993 loci, composed of 248618752 sites; 2268754 of those sites were filtered, 838333 variant sites remained.
221216030 genomic sites, of which 26716049 were covered by multiple loci (12.1%). Mean genotyped sites per locus: 795.32bp (stderr 0.28).
How does coverage look now?
#first capture the depth per locus per individual using vcftools
#!/bin/bash
# SLURM job: capture depth summaries from the populations SNP VCF with vcftools.
# set max wall-clock time (D-HH:MM:SS)
#SBATCH --time=0-23:59:00
#SBATCH --cpus-per-task=1
# set partition/queue to use
#SBATCH --partition=day-long-cpu
# set name of job
#SBATCH --job-name=depths
# set name of output file
#SBATCH --output=depths.out
# Four depth summaries from the same VCF (default "out" prefix, so the four
# runs write distinct files: out.idepth, out.ldepth, out.ldepth.mean,
# out.gdepth — the names the R code below reads).
# --depth           : mean depth per individual
# --site-depth      : summed depth per site
# --site-mean-depth : mean depth per site across individuals
# --geno-depth      : depth for every genotype (site x individual matrix)
/opt/vcftools_0.1.13/vcftools --vcf /home/ddayan/plastic/stacks/genotypes/populations.snps.vcf --depth
/opt/vcftools_0.1.13/vcftools --vcf /home/ddayan/plastic/stacks/genotypes/populations.snps.vcf --site-depth
/opt/vcftools_0.1.13/vcftools --vcf /home/ddayan/plastic/stacks/genotypes/populations.snps.vcf --site-mean-depth
/opt/vcftools_0.1.13/vcftools --vcf /home/ddayan/plastic/stacks/genotypes/populations.snps.vcf --geno-depth
# Randomly subsample the geno-depth matrix to a plottable size (10k SNPs;
# the old comment said 50k but shuf drew 10000). The header is excluded
# BEFORE sampling: previously `shuf -n 10000 out.gdepth` could draw the
# header as a data line, and the follow-up `sed 1d` then deleted a random
# data row rather than the header.
( sed 1q out.gdepth; tail -n +2 out.gdepth | shuf -n 10000 ) > gdepth_10k.txt
# Load the vcftools depth summaries produced above.
gdepths <- read_tsv("log_files/12-8_run/gdepth_10k.txt")       # site x individual depths (10k SNP sample)
idepth <- read_tsv("log_files/12-8_run/out.idepth")            # mean depth per individual
ldepthmean <- read_tsv("log_files/12-8_run/out.ldepth.mean")   # mean depth per site

# Pool the per-genotype depths (columns 3:397 are the individuals) into one
# column for a combined density plot.
gs <- data.frame(depth = unlist(gdepths[3:397]))

# Medians passed as fixed xintercept values, not inside aes() (which would
# redraw the same line per row).
a <- ggplot(idepth, aes(x = MEAN_DEPTH)) +
  geom_density() +
  geom_vline(xintercept = median(idepth$MEAN_DEPTH)) +
  ggtitle("Mean Depth per Locus per Individual")
b <- ggplot(ldepthmean, aes(x = MEAN_DEPTH)) +
  geom_density() +
  geom_vline(xintercept = median(ldepthmean$MEAN_DEPTH)) +
  ggtitle("Mean Depth per Locus") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
# Genotype-level depths, truncated at 50x for readability.
c <- ggplot(gs, aes(x = depth)) +
  geom_density() +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  xlim(0, 50)
plot_grid(a, b, c)
Something not good is going on here, when we look at the SNP level more than 50% of the sites have > ~12x coverage, suggesting some coverage based filtering could produce a sizeable snp dataset to conduct our analyses on, but when we look at the SNPxIndividual level we see that many SNPs have 0 depth. Let’s look more closely…
#lets make a clustered heatmap of coverage split across populations
# Down-sample to 1000 SNPs so the heatmap stays tractable.
smallgs <- sample_n(gdepths, 1000)
# The commented block below was a hand-rolled clustered heatmap (gather to
# long format, hclust on depth, geom_tile faceted by population). Kept for
# the record; superseded by the heatmaply call further down.
# smallgs$ID <- seq.int(nrow(smallgs))
#
#
# long_gs <- gather(gdepths, individual, depth, bb_f_10:rs_s_87)
# long_gs <- long_gs[complete.cases(long_gs),]
#
# small_long_gs <- gather(smallgs, individual, depth, bb_f_10:rs_s_87)
# small_long_gs <- small_long_gs[complete.cases(small_long_gs),]
# samples <- sample(colnames(gdepths[,3:397]), 40)
# small_long_gs <- subset(small_long_gs, individual %in% samples)
# small_long_gs$pop <- substr(small_long_gs$individual, start=1, stop = 2)
#
# #now lets reorder the rows according to clustering
# hc <- hclust(dist(as.matrix(small_long_gs[,c("depth", "ID", "individual")])))
# ord <- order.dendrogram(as.dendrogram(hc))
# small_long_gs$ID <- factor(x = small_long_gs$ID, levels = unique(small_long_gs$ID[ord]), ordered = TRUE)
#
#
# ggplot(data=small_long_gs)+geom_tile(aes(x=individual,y=ID, fill=depth, group = pop))+scale_x_discrete(expand=c(0,0))+scale_y_discrete(expand=c(0,0))+scale_fill_viridis_c()+facet_grid(. ~ pop, scales = "free")
#all that was very nice, but it turns out theres a great package that will do a better job of that
# library() rather than require(): fail loudly if heatmaply is missing.
library(heatmaply)

# Draw 200 random individuals (columns 3:397 of gdepths hold the samples)
# and order the retained columns by sample name so individuals from the
# same population sit together. Sorting the columns directly replaces the
# old transpose / order-by-rowname / transpose-back dance, which did the
# same thing with two unnecessary matrix conversions.
samples <- sample(colnames(gdepths[, 3:397]), 200)
smallergs <- smallgs[, sort(samples)]

# Depth heatmap; column side-bar colored by population code (first two
# characters of the sample name, e.g. "bb", "cl", "lb", "rs").
heatmaply(smallergs,
          Colv = FALSE,
          col_side_colors = substr(colnames(smallergs), start = 1, stop = 2),
          titleX = FALSE, titleY = FALSE)